/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

/*
 * Revision-control identification string for this file.  It is not
 * referenced anywhere else in this file.
 * NOTE(review): presumably kept so the revision can be recovered from
 * built objects (e.g. with what(1) or ident(1)) -- confirm before removing.
 */
static const char __idstring[] = "@(#)$Id: post.c,v 1.34 2005/06/29 00:23:16 eugene Exp $";

/*
  This file contains routines associated with all types of posts.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/poll.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#include "mx_auto_config.h"
#include "myriexpress.h"
#include "internal.h"

/*
 * Forward declarations for routines defined further down in this file.
 * Neither is declared static, so both have external linkage.
 *
 * mx_accept_connections(endpoint)
 *     accept every connection pending on the endpoint's socket
 * mx_check_incoming(endpoint, timeout)
 *     poll the endpoint's sockets; timeout is handed to poll() as-is,
 *     i.e. it is in milliseconds
 */
void mx_accept_connections(struct mx_endpoint *);
void mx_check_incoming(struct mx_endpoint *, int32_t);

/*
 * Release a post and everything it owns.
 *
 * Send, receive and broadcast posts may carry a segment list, which is
 * released through mx_destroy_segment_list() when present.  Barrier posts
 * have nothing type-specific to release.  Afterwards the post's two
 * condition variables are destroyed and the post itself is freed.
 *
 * A post whose type is none of the above is treated as fatal: a message
 * is emitted through MX_DEBUG_PRINT and the process exits with status 1.
 *
 * post  the post to destroy; must not be used by the caller afterwards
 */
void
mx_destroy_post(struct mx_post *post)
{
  /* Type-specific cleanup comes first. */
  switch (post->type) {
  case MX_POST_TYPE_SEND:
    if (post->ts.send.seg_list != NULL) {
      mx_destroy_segment_list(post->ts.send.seg_list);
    }
    break;

  case MX_POST_TYPE_RECV:
    if (post->ts.recv.seg_list != NULL) {
      mx_destroy_segment_list(post->ts.recv.seg_list);
    }
    break;

  case MX_POST_TYPE_BCAST:
    if (post->ts.bcast.seg_list != NULL) {
      mx_destroy_segment_list(post->ts.bcast.seg_list);
    }
    break;

  case MX_POST_TYPE_BARRIER:
    /* XXX nothing really needs to be done */
    break;

  default:
    /* XXX */
    MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("destroying unknown post type %d\n", post->type));
    exit(1);
  }

  /* Teardown common to every post type. */
  pthread_cond_destroy(&post->wait_cond);
  pthread_cond_destroy(&post->buffered_cond);
  free(post);
}

/*
 * Allocate and initialize a new post.
 *
 * endpoint  endpoint the post belongs to
 * type      kind of post being created
 * context   caller-supplied value stored in post->status.context
 *
 * The post is zero-filled by calloc(), so every field not set explicitly
 * below (e.g. the complete flag and the list links) starts out as zero/NULL.
 * Both condition variables are initialized with default attributes; the
 * return values of pthread_cond_init() are not checked.
 *
 * Returns the new post, or NULL if memory could not be allocated.
 * The post is released with mx_destroy_post().
 */
struct mx_post *
mx_new_post(struct mx_endpoint *endpoint,
            mx_post_type_t type,
            void *context)
{
  struct mx_post *post;

  post = calloc(1, sizeof(*post));
  if (post == NULL) {
    /* out of memory: report it instead of dereferencing NULL below */
    return NULL;
  }

  /* fill it in */
  post->endpoint = endpoint;
  post->type = type;
  post->status.context = context;

  pthread_cond_init(&post->wait_cond, NULL);
  pthread_cond_init(&post->buffered_cond, NULL);
  return post;
}

/*
 * Duplicate an address list.
 *
 * list      array of num_endp endpoint addresses to copy
 * num_endp  number of entries in list
 *
 * Returns a freshly allocated copy of the array, or NULL if the
 * allocation fails (including when num_endp * sizeof(entry) does not fit
 * in size_t).  For num_endp == 0 the result is whatever calloc() returns
 * for a zero-sized request.  The caller owns the copy and releases it
 * with free().
 */
mx_endpoint_addr_t *
mx_new_address_list(mx_endpoint_addr_t *list,
                    uint32_t num_endp)
{
  mx_endpoint_addr_t *new_list;

  /*
   * calloc() fails cleanly when count * size overflows, whereas a bare
   * multiplication handed to malloc() would wrap and under-allocate.
   */
  new_list = calloc(num_endp, sizeof(*new_list));
  if (new_list == NULL) {
    return NULL;
  }

  /* fill in the array; skip the copy for an empty list so that a NULL
     source pointer is never passed to memcpy() */
  if (num_endp > 0) {
    memcpy(new_list, list, sizeof(*new_list) * num_endp);
  }

  return new_list;
}

/*
 * Duplicate a segment list.
 *
 * list     array of num_seg segment descriptors to copy
 * num_seg  number of entries in list
 *
 * Only the descriptors are copied; whatever the descriptors refer to is
 * not duplicated.
 *
 * Returns a freshly allocated copy of the array, or NULL if the
 * allocation fails (including when num_seg * sizeof(entry) does not fit
 * in size_t).  For num_seg == 0 the result is whatever calloc() returns
 * for a zero-sized request.  The caller owns the copy and releases it
 * with mx_destroy_segment_list().
 */
mx_segment_t *
mx_new_segment_list(mx_segment_t *list,
                    uint32_t num_seg)
{
  mx_segment_t *new_list;

  /*
   * calloc() fails cleanly when count * size overflows, whereas a bare
   * multiplication handed to malloc() would wrap and under-allocate.
   */
  new_list = calloc(num_seg, sizeof(*new_list));
  if (new_list == NULL) {
    return NULL;
  }

  /* fill in the array; skip the copy for an empty list so that a NULL
     source pointer is never passed to memcpy() */
  if (num_seg > 0) {
    memcpy(new_list, list, sizeof(*new_list) * num_seg);
  }

  return new_list;
}

/*
 * Destroy a segment list.
 *
 * seglist  array of segment descriptors to release; it is handed straight
 *          to free(), so NULL is accepted and is a no-op
 *
 * Only the descriptor array is freed; nothing the descriptors refer to
 * is touched.
 */
void
mx_destroy_segment_list(mx_segment_t *seglist)
{
  free(seglist);
}

/*
 * Compute the total length described by a segment list.
 *
 * seg_list  array of seg_cnt segment descriptors
 * seg_cnt   number of entries in seg_list
 *
 * Returns the sum of the segment_length fields.  The sum is accumulated
 * in a uint32_t, so it wraps modulo 2^32 if the total exceeds that range.
 * An empty list (seg_cnt == 0) yields 0 and seg_list is not dereferenced.
 */
uint32_t
mx_segment_list_len(mx_segment_t *seg_list,
                    uint32_t seg_cnt)
{
  uint32_t i;   /* same type as seg_cnt: no signed/unsigned comparison */
  uint32_t len;

  len = 0;
  for (i = 0; i < seg_cnt; ++i) {
    len += seg_list[i].segment_length;
  }

  return len;
}

/*
 * Transmit side of the "asynchronous" I/O: perform one round of send work.
 *
 * The endpoint argument is not used; every endpoint on the global
 * Mx_endpoints list is considered.
 *
 * With Mx_po_lock held, the list is scanned for the first endpoint that
 * has something on its send_list or its bcast_list.  Once the lock has
 * been dropped:
 *   - if a pending send was found, that endpoint's send queue is run;
 *   - else if a pending bcast was found, that endpoint's bcast queue is run;
 *   - otherwise the calling thread sleeps on Mx_send_cond.
 *
 * If Mx_init_count is 0 the calling thread releases Mx_po_lock and
 * Mx_tx_lock and terminates itself with pthread_exit(); in that case
 * this function does not return.
 *
 * NOTE(review): Mx_tx_lock is never acquired here, yet it is unlocked on
 * the exit path and passed to pthread_cond_wait(), so the caller
 * presumably holds it on entry -- confirm against the caller.
 * NOTE(review): mx_run_bcast_queue() in this file currently just calls
 * abort(), so finding a non-empty bcast_list terminates the process.
 */
void
mx_run_asynch_tx(struct mx_endpoint *endpoint)
{
  /* perform pending sends */
  struct mx_endpoint *ep;
  struct mx_endpoint *sep = NULL;   /* first endpoint found with a pending send */
  struct mx_endpoint *bep = NULL;   /* first endpoint found with a pending bcast */
  
  pthread_mutex_lock(&Mx_po_lock);
  if (Mx_init_count == 0){
    /* nothing left to serve: release both locks and end this thread */
    pthread_mutex_unlock(&Mx_po_lock);
    pthread_mutex_unlock(&Mx_tx_lock);
    pthread_exit(NULL);
  }
  /* stop at the first endpoint with work; within one endpoint a pending
     send takes precedence over a pending bcast */
  for(ep = Mx_endpoints.next; ep != &Mx_endpoints; ep = ep->next){
    if (ep->send_list.next != &ep->send_list){
      sep = ep;
      break;
    }
    if (ep->bcast_list.next != &ep->bcast_list){
      bep = ep;
      break;
    }
  }
  pthread_mutex_unlock(&Mx_po_lock);
    
  if (sep) {
    (void) mx_run_send_queue(sep);
  } else if (bep) {
    (void) mx_run_bcast_queue(bep);
  } else {
    /* no work anywhere: sleep until Mx_send_cond is signalled */
    pthread_cond_wait(&Mx_send_cond, &Mx_tx_lock);
  }
}

/*
 * Receive side of the "asynchronous" I/O: perform one round of receive
 * work for an endpoint.
 *
 * Delegates to mx_check_incoming(), which accepts any pending
 * connections and polls for incoming data, waiting at most one second.
 */
void
mx_run_asynch_rx(struct mx_endpoint *endpoint)
{
  /* upper bound on one poll() wait, in milliseconds */
  const int32_t rx_poll_timeout_ms = 1000;

  mx_check_incoming(endpoint, rx_poll_timeout_ms);
}

/*
 * Try to push out every send queued on an endpoint.
 *
 * Walks endpoint->send_list and calls mx_try_send() on each post.  When
 * mx_try_send() returns non-zero the post is unlinked from the send list
 * and then, depending on its send type, is either moved to another queue
 * of the endpoint (completed_posts, buffered_posts or putget_posts) with
 * the matching condition variables signalled, or destroyed.  Posts for
 * which mx_try_send() returns zero stay on the send list.
 *
 * Insertions into the other queues are made with Mx_po_lock held.  This
 * function does not take any lock around the walk of send_list itself.
 * NOTE(review): presumably the caller serializes access to send_list --
 * confirm.
 *
 * A post on the send list whose post type is not MX_POST_TYPE_SEND, or
 * whose send type is not handled below, is fatal: the process exits with
 * status 1.
 *
 * Returns 1 if at least one post was taken off the send list, else 0.
 */
uint32_t
mx_run_send_queue(struct mx_endpoint *endpoint)
{
  struct mx_post *post;
  struct mx_post *next;
  /*    struct mx_post anchor; */
  struct mx_lib_send *send;
  int sent;
  int rc;
    
  sent = 0;	/* nothing sent yet */

    
    
  /* return if nothing to do */
  if (endpoint->send_list.next == &endpoint->send_list) {
    return 0;
  }
    
  /* (disabled) steal the whole list since try_send may release lock */
  /*
    anchor.next = endpoint->send_list.next;
    anchor.prev = endpoint->send_list.prev;
    endpoint->send_list.next = &endpoint->send_list;
    endpoint->send_list.prev = &endpoint->send_list;
    anchor.next->prev = &anchor;
    anchor.prev->next = &anchor;
  */

  /* run through all the pending sends for this 
     endpoint and try to complete each */
  for (post = endpoint->send_list.next; post != &endpoint->send_list; ) {
    rc = mx_try_send(post);
        
    /* if send completed, move post record from send queue 
       to the queue matching its send type */
    if (rc) {
      next = post->next;	/* save next pointer before unlinking */
            
      /* unlink from pending list */
      MX_LIST_REMOVE(post);

      if (post->type != MX_POST_TYPE_SEND){
        MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("this isn't a send!!!\n"));
        exit(1);
      }

      send = &post->ts.send;
      switch (send->type){
      case MX_SR_TYPE_SEND:
        {
          /* plain send: it is both buffered and complete as soon as
             it has gone out */
          /* link onto end of completed list */
          pthread_mutex_lock(&Mx_po_lock);

          MX_LIST_INSERT(&endpoint->completed_posts, post);
                    
          /* Fill in status fields */
          post->status.code = MX_STATUS_SUCCESS;
          post->status.msg_length = post->ts.send.length;
          post->status.xfer_length = post->ts.send.length;
                    
          post->buffered = 1;     /* mark as buffered */
          pthread_cond_signal(&post->buffered_cond);
                    
          post->complete = 1;	/* mark as completed */
         
          pthread_cond_signal(&post->wait_cond);
          pthread_cond_signal(&endpoint->peek_cond);
          pthread_mutex_unlock(&Mx_po_lock);
                                        
        }
        break;
      case MX_SR_TYPE_ISSEND:
        {
          /* only marked buffered here, not complete; the post is parked
             on buffered_posts.  NOTE(review): presumably completion
             happens when the matching ISSEND_ACK arrives -- confirm */
          pthread_mutex_lock(&Mx_po_lock);
          MX_LIST_INSERT(&endpoint->buffered_posts, post);
          post->buffered = 1;
          pthread_cond_signal(&post->buffered_cond);
          pthread_mutex_unlock(&Mx_po_lock);

        }
        break;
      case MX_SR_TYPE_BARRIER:
      case MX_SR_TYPE_ISSEND_ACK:
      case MX_SR_TYPE_PUT_ACK:
      case MX_SR_TYPE_GET:
      case MX_SR_TYPE_GET_DATA:
        {
          /* the post is not queued anywhere for these types; it is
             simply freed once sent (original note: sender of a barrier
             doesn't care when it's done) */
          mx_destroy_post(post);
        }
        break;
      case MX_SR_TYPE_PUT:
        {
          /* NOTE(review): status.code is not written in this function
             before this test; presumably mx_try_send() sets
             MX_STATUS_REJECTED -- confirm */
          if (post->status.code == MX_STATUS_REJECTED){
            /* rejected put: complete it right away with nothing transferred */
            pthread_mutex_lock(&Mx_po_lock);
            post->status.msg_length = post->ts.send.length;
            post->status.xfer_length = 0;
            post->buffered = 1;     /* mark as buffered */
            pthread_cond_signal(&post->buffered_cond);
            post->complete = 1;	/* mark as completed */
            MX_LIST_INSERT(&endpoint->completed_posts, post);
            pthread_cond_signal(&post->wait_cond);
            pthread_cond_signal(&endpoint->peek_cond);
            pthread_mutex_unlock(&Mx_po_lock);
          } else {
            /* otherwise park the post on putget_posts, neither buffered
               nor complete yet */
            pthread_mutex_lock(&Mx_po_lock);
            MX_LIST_INSERT(&endpoint->putget_posts, post);
            pthread_mutex_unlock(&Mx_po_lock);
          }
        }
        break;
      default:
        {
          MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("got send of unknown type %d\n", send->type));
          exit(1);
        }
      }
      sent = 1;	/* report that we sent something */
            
      /* advance to next post */
      post = next;
            
      /* if not complete, just move along to the next one */
    } else {
      post = post->next;
    }
        
  }
    
  /* (disabled) put these guys back on the list */
  /*
    if (anchor.next != &anchor) {
    anchor.next->prev = &endpoint->send_list;
    anchor.prev->next = endpoint->send_list.next;
    endpoint->send_list.next->prev = anchor.prev;
    endpoint->send_list.next = anchor.next;
    }
  */

  return sent;
}

/*
 * Broadcast counterpart of mx_run_send_queue().
 *
 * The real implementation is compiled out (#if 0).  The live body calls
 * abort(), so reaching this function terminates the process; the
 * "return 0" after it is never executed.
 *
 * NOTE(review) on the disabled code, should it ever be re-enabled:
 *   - bcast->type is compared against MX_POST_TYPE_BCAST, a post-type
 *     constant, while the other branch compares it against an
 *     MX_SR_TYPE_* constant -- confirm which set bcast->type belongs to;
 *   - the "unknown type" message prints post->type, not bcast->type.
 */
uint32_t
mx_run_bcast_queue(struct mx_endpoint *endpoint)
{
#if 0
  struct mx_post *post;
  struct mx_post *next;
  struct mx_lib_bcast *bcast;
  int sent;
  int rc;
    
  sent = 0;	/* nothing sent yet */
    
  /* run through all the pending bcasts for this endpoint and try to complete each */
  for (post = endpoint->bcast_list.next; post != &endpoint->bcast_list; ) {
    rc = mx_try_bcast(post);
        
    /* if bcast completed, move post record from bcast queue to completed queue */
    if (rc) {
      next = post->next;	/* save next pointer */
            
      /* unlink from pending list */
      MX_LIST_REMOVE(post);
            
      if (post->type != MX_POST_TYPE_BCAST) {
        /* XXX better error handling */
        MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("this isn't a bcast!!!\n"));
        exit(1);
      }
      bcast = &post->ts.bcast;
      if (bcast->type == MX_POST_TYPE_BCAST) {
        pthread_mutex_lock(&Mx_po_lock);

        /* link onto end of completed list */
        MX_LIST_INSERT(&endpoint->completed_posts, post);
        /* Fill in status fields */
        post->status.code = MX_STATUS_SUCCESS;
        post->status.msg_length = post->ts.bcast.length;
        post->status.xfer_length = post->ts.bcast.length;
               
        post->buffered = 1;
        post->complete = 1;	/* mark as completed */

        pthread_cond_signal(&post->buffered_cond);
        pthread_cond_signal(&post->wait_cond);
        pthread_cond_signal(&endpoint->peek_cond);
        pthread_mutex_unlock(&Mx_po_lock);
        
      }
      else if (bcast->type == MX_SR_TYPE_BARRIER_ACK) {
        pthread_mutex_lock(&Mx_po_lock);
        /* XXX combine with MX_POST_TYPE_BCAST */
        /* Fill in status fields */
        post->status.code = MX_STATUS_SUCCESS;
        post->status.msg_length = post->ts.bcast.length;
        post->status.xfer_length = post->ts.bcast.length;
        
        post->complete = 1;	/* mark as completed */
        post->buffered = 1;
        /* link onto end of completed list */
        MX_LIST_INSERT(&endpoint->completed_posts, post);
        pthread_cond_signal(&post->buffered_cond);
        pthread_cond_signal(&post->wait_cond);
        pthread_cond_signal(&endpoint->peek_cond);
        pthread_mutex_unlock(&Mx_po_lock);
      }
      else {
	/* XXX better error handling */
	MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("got a bcast of unknown type %d\n", post->type));
	exit(1);
      }
      sent = 1;			/* report that we sent something */

      /* advance to next post */
      post = next;

      /* if not complete, just move along to the next one */
    } else {
      post = post->next;
    }

  }

  return sent;
#else
  /* bcast support is disabled: getting here is treated as a fatal error */
  abort();
  return 0;
#endif
}

/*
 * Poll an endpoint's sockets once and service whatever is ready.
 *
 * endpoint  endpoint whose read_sock array is polled
 * timeout   passed to poll() unchanged: milliseconds to wait, 0 to return
 *           immediately, -1 to block until something is ready
 *
 * poll() is called before any lock is taken.  Then, with Mx_rx_lock and
 * Mx_po_lock both held:
 *   1. every known address whose receive socket is reported bad is
 *      unlinked, has its receive socket closed, and is freed;
 *   2. if the endpoint's own socket is ready, pending connections are
 *      accepted;
 *   3. the known addresses whose receive socket is ready are collected
 *      into a scratch array.
 * Mx_po_lock is then released and mx_receive_data() is called for each
 * collected address while Mx_rx_lock is still held.  Both locks are
 * released before returning on every path.
 *
 * rc starts as poll()'s return value and is decremented for each ready
 * descriptor that has been accounted for, so each stage stops early once
 * nothing is left.  A poll() error or timeout (rc <= 0) therefore does
 * no work at all.
 *
 * If the scratch array cannot be allocated, a debug message is emitted
 * and the function returns without receiving anything.
 */
void
mx_check_incoming(struct mx_endpoint *endpoint,
                  int32_t timeout)
{

  struct mx_address_desc *ep;
  struct mx_address_desc **ep_cpy;
  int rc;
  int i, j;

#if 0
  MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("call poll, timeout = %d\n", timeout));
  {
    int i;
    MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("polling on:"));
    for (i=0; i<endpoint->sock_count; ++i) {
      MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("%d,", endpoint->read_sock[i].fd));
    }
    MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("\n"));
  }
#endif
    
  rc = poll(endpoint->read_sock, endpoint->sock_count, timeout);
  
  pthread_mutex_lock(&Mx_rx_lock);  
  pthread_mutex_lock(&Mx_po_lock);
  
  /* Remove dead sockets. */
  {
    struct mx_address_desc *x, *y;
        
    x = endpoint->known_addresss.next;
    while ((rc > 0) && (x != &endpoint->known_addresss)) {
      if (mx_endpoint_sock_bad(endpoint, x->recv_sock)) {
        MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("removing %d\n", x->recv_sock));
        --rc;
        /* step past the entry before it is unlinked and freed */
        y = x;
        x = x->next;
        MX_LIST_REMOVE(y);
        mx_close_recv_socket(endpoint, y);
        free(y);
      }
      else {
        x = x->next;
      }
    }
  }
    
  /* Check for any incoming connections */
  if ((rc > 0) && (mx_endpoint_sock_ready(endpoint, endpoint->socket))) {
    mx_accept_connections(endpoint);
    --rc;
  }

  if (rc > 0){
    /* at most rc addresses can still be ready, so rc slots suffice */
    ep_cpy = malloc(rc * sizeof(*ep_cpy));
    if (ep_cpy == NULL) {
      /* out of memory: give up on this round rather than write through
         a NULL pointer; release the locks in the usual order */
      MX_DEBUG_PRINT(MX_DEBUG_TCP_LIB,("mx_check_incoming: out of memory\n"));
      pthread_mutex_unlock(&Mx_po_lock);
      pthread_mutex_unlock(&Mx_rx_lock);
      return;
    }

    i = 0;
    ep = endpoint->known_addresss.next;
    while ((rc > 0) && (ep != &endpoint->known_addresss)) {
      /* check for data from this address */
      if (mx_endpoint_sock_ready(endpoint, ep->recv_sock)) {
        --rc;   /* decrement count of ready receives still unaccounted for */
        ep_cpy[i++] = ep;
      }
      ep = ep->next;    /* go to next address */
    }

    /* the address list is no longer walked: drop Mx_po_lock before the
       receives, which run under Mx_rx_lock only */
    pthread_mutex_unlock(&Mx_po_lock);

    for(j = 0; j < i; j++){
      mx_receive_data(endpoint, ep_cpy[j]);
    }
    
    free(ep_cpy);
    pthread_mutex_unlock(&Mx_rx_lock);
  } else {
    pthread_mutex_unlock(&Mx_po_lock);
    pthread_mutex_unlock(&Mx_rx_lock);
  }

}

/*
 * This is called when there is known to be at least one pending
 * connection to be accepted.  Establish it, then keep checking the
 * endpoint's socket and establishing connections until none is pending.
 *
 * endpoint  endpoint whose socket has a connection waiting
 *
 * The socket is checked with a zero-timeout poll(), so this never blocks
 * waiting for a new connection.  poll() is used rather than select()
 * because FD_SET()/select() are undefined for descriptors at or above
 * FD_SETSIZE, and the rest of this file already polls its sockets with
 * poll().  A poll() failure is reported with perror() and ends the loop.
 */
void
mx_accept_connections(struct mx_endpoint *endpoint)
{
  struct pollfd listen_fd;
  int rc;

  /* describe what to look for - only need to do this once */
  listen_fd.fd = endpoint->socket;
  listen_fd.events = POLLIN;

  /* we repeat this as long as we find pending connections */
  do {

    /* establish this connection */
    mx_establish_connection(endpoint);

    /* zero timeout so that we are polling */
    listen_fd.revents = 0;
    rc = poll(&listen_fd, 1, 0);
    if (rc == -1) {
      perror("poll in mx_accept_connections");
    }

    /* checking both rc and revents is belt and suspenders... */
  } while ((rc > 0) && (listen_fd.revents & POLLIN));
}

